
# R script for preparing the cleaned raw dataset for ranking analysis
# This file creates three tailored datasets at department/faculty/publication levels

# NOTE(review): this wipes every object in the calling session's global
# environment. Kept to preserve the script's existing behaviour; consider
# running the script in a fresh R session instead.
rm(list = ls())

# Dependencies: install a package only when it is missing, then attach it
# unconditionally. Using `require()` alone would leave a freshly installed
# package unattached, and the unqualified tidyverse verbs below would fail.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

if (!requireNamespace("writexl", quietly = TRUE)) {
  install.packages("writexl")
}
library(writexl)

# Read raw data -----------------------------------------------------------

# Make sure the raw dataset is placed in the same (sub)dir as this script

# Load the raw dataset; every later section reads from `full_dat`.
full_dat <- readRDS("Dataset_raw.RDS")

# Add new variables -------------------------------------------------------

## Years of publication ----
# Rows flagged as forthcoming are assigned the year 2023. `%in%` is used
# instead of `==` so that a missing flag keeps the recorded year rather than
# turning it into NA.
full_dat$year <- ifelse(
  full_dat$year_forthcoming %in% 1,
  2023,
  full_dat$year
)

## Years since PhD ----
# Counted inclusively: the PhD year itself counts as year 1.
full_dat$phd_year_passed <- full_dat$year - full_dat$phd_year + 1

## Top-5 polisci publications ----
# Score per row: 2 for APSR, 1 for the four other listed journals, 0 otherwise
# (rows with a missing journal also stay at 0).
full_dat$polisci_top5 <- 0
full_dat$polisci_top5[
  full_dat$journal == "AMERICAN POLITICAL SCIENCE REVIEW"
] <- 2
full_dat$polisci_top5[
  full_dat$journal %in% c(
    "AMERICAN JOURNAL OF POLITICAL SCIENCE",
    "JOURNAL OF POLITICS",
    "INTERNATIONAL ORGANIZATION",
    "WORLD POLITICS"
  )
] <- 1

## Recent five years indicator ----
# 1 for publication years 2018-2022 inclusive, 0 otherwise (NA if year is NA).
# Forthcoming items (set to 2023 above) are therefore not counted as recent.
full_dat$year_recent5 <- ifelse(
  full_dat$year >= 2018 & full_dat$year <= 2022,
  1,
  0
)

## Num. of coauthors ----
# Shift the raw count up by one and treat a missing count as 1, so the column
# is always a valid (>= 1) divisor in the per-coauthor weights computed later.
# NOTE(review): presumably the raw column excludes the faculty member
# themselves -- confirm against the raw data dictionary.
full_dat$no_coauthors <- full_dat$no_coauthors + 1
full_dat$no_coauthors[is.na(full_dat$no_coauthors)] <- 1

# Add university labels ---------------------------------------------------

# Make sure the university spreadsheet is placed in the same (sub)dir as this script

# Lookup table: one abbreviation per (region, university) pair.
ls_uni <- readxl::read_xlsx("Dictionary_universities.xlsx") |>
  dplyr::select(region, university, university_abbr) |>
  dplyr::distinct()

# Guard the join below: if a (region, university) pair carries more than one
# abbreviation, the left join would duplicate publication rows and inflate
# every sum computed from `full_dat` afterwards.
if (anyDuplicated(ls_uni[c("region", "university")]) > 0) {
  stop(
    "Dictionary_universities.xlsx has more than one abbreviation for at ",
    "least one (region, university) pair.",
    call. = FALSE
  )
}

# Attach the abbreviation, place it right after `university`, and sort rows.
full_dat <- full_dat |>
  dplyr::left_join(ls_uni, by = c("region", "university")) |>
  dplyr::relocate(university_abbr, .after = university) |>
  dplyr::arrange(region, university, department, name, year)

# Create subset: publication-level data -----------------------------------

# One row per (region, publication). When a publication appears several times,
# keep the record with the latest year, then the highest citation count, then
# the alphabetically first source.
dat_publications <- full_dat |>
  dplyr::select(region, year, title, cites, journal, publisher) |>
  dplyr::rename(publication = title) |>
  # Source is the journal when available, otherwise the publisher.
  dplyr::mutate(source = ifelse(is.na(journal), publisher, journal)) |>
  dplyr::distinct() |>
  # Sort once up front; the grouped slice below then picks the first row of
  # each group in this order.
  dplyr::arrange(desc(year), desc(cites), source) |>
  dplyr::group_by(region, publication) |>
  dplyr::slice_head(n = 1) |>
  dplyr::ungroup() |>
  dplyr::select(region, publication, year, citations = cites, source) |>
  # Infinite citation counts are reset to zero.
  dplyr::mutate(citations = replace(citations, is.infinite(citations), 0))

# Create subset: faculty-level data ---------------------------------------

# One row per faculty member (region, country, university, abbreviation, name).
# Every measure is a sum over that person's publication rows with missing
# terms dropped; `*_recent` variants keep only rows where year_recent5 is 1.
dat_faculty <- full_dat |>
  dplyr::group_by(region, country, university, university_abbr, name) |>
  dplyr::summarise(
    # Faculty status (full/part-time), averaged over the person's rows.
    status = mean(status, na.rm = TRUE),

    # Set i: unweighted measures ------------------------------------------
    ## Citations
    citations = sum(cites, na.rm = TRUE),
    citations_recent = sum(cites * year_recent5, na.rm = TRUE),
    ## Impact (five-year historic JIF)
    impact = sum(fiveyearjif_historic, na.rm = TRUE),
    impact_recent = sum(fiveyearjif_historic * year_recent5, na.rm = TRUE),
    ## Top-publication score
    pub_top = sum(polisci_top5, na.rm = TRUE),
    pub_top_recent = sum(polisci_top5 * year_recent5, na.rm = TRUE),

    # Set ii: weighted by historic JIF, divided by no_coauthors ------------
    ## Citations x historic JIF / no_coauthors
    citations_historic_ca = sum(
      cites * jif_historic / no_coauthors,
      na.rm = TRUE
    ),
    citations_recent_historic_ca = sum(
      cites * year_recent5 * jif_historic / no_coauthors,
      na.rm = TRUE
    ),
    ## Five-year historic JIF / no_coauthors
    impact_historic_ca = sum(
      fiveyearjif_historic / no_coauthors,
      na.rm = TRUE
    ),
    impact_recent_historic_ca = sum(
      fiveyearjif_historic * year_recent5 / no_coauthors,
      na.rm = TRUE
    ),
    ## Top-publication score x historic JIF / no_coauthors
    pub_top_historic_ca = sum(
      polisci_top5 * jif_historic / no_coauthors,
      na.rm = TRUE
    ),
    pub_top_recent_historic_ca = sum(
      polisci_top5 * year_recent5 * jif_historic / no_coauthors,
      na.rm = TRUE
    )
  ) |>
  dplyr::ungroup()

# Create subset: department-level data  -----------------------------------

# This requires the faculty-level data to be computed first

# One row per department (region, country, university, abbreviation).
# - dept_size: sum of the faculty-level `status` values in the department.
# - citations ... pub_top_recent_historic_ca: department totals of the
#   faculty-level measures.
# - *_pf: the same totals divided by dept_size (per-faculty measures).
dat_departments <- dat_faculty |>
  dplyr::group_by(region, country, university, university_abbr) |>
  dplyr::summarise(
    dept_size = sum(status, na.rm = TRUE),
    dplyr::across(
      .cols = citations:pub_top_recent_historic_ca,
      .fns = ~ sum(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  dplyr::mutate(dplyr::across(
    .cols = citations:pub_top_recent_historic_ca,
    # A department whose size sums to zero has no meaningful per-faculty
    # value; return NA instead of the Inf/NaN that dividing by zero produces.
    .fns = function(x) {
      dplyr::if_else(dept_size > 0, x / dept_size, NA_real_)
    },
    .names = "{.col}_pf"
  ))

# Export subsets ----------------------------------------------------------

# Bundle the three datasets under their output names; the names become the
# list element names in the RDS file and the sheet names in the workbook.
ls_dats <- setNames(
  list(dat_departments, dat_faculty, dat_publications),
  c("department-level", "faculty-level", "publication-level")
)

# Write the bundle to the working directory in both formats.
saveRDS(ls_dats, file = "Dataset_cleaned.RDS")
writexl::write_xlsx(ls_dats, path = "Dataset_cleaned.xlsx")

# END #
